*_____________________________________________________________________________________________________________________________________________________
*
**# 0. ENSURE THAT NO MISSING DATA
* (This causes problems for the bootstrapping, so need to ensure done at this stage so that disability prevalence is comparable across measures)
*_____________________________________________________________________________________________________________________________________________________

// Sample selection
// Purpose: reduce the data to complete cases BEFORE any weights are built, so that every
// disability measure below (and every bootstrap replicate) is computed on the same people.
// Globals read here (all set outside this section): disvar, pweight (checks only), countrywaves,
// agerange, controls2, empvar, allIRTbinary, allIRTordinal, allIRTnominal, dodir, if.
drop if missing(${disvar})					// CHANGE FOR EACH SURVEY!
* Checking age is comparable across country
	/* Initial checks
	version 14
	table country_waveN if !missing(${pweight}), c(min ragey max ragey)
	version 17
	*/
	* Using this to decide which countries to include
	keep if ${countrywaves}
	keep if ${agerange}
	/* And checking again
	table country if !missing(${pweight}), c(min ragey max ragey)
	tab country ragey if !missing(${pweight}), row nof
	*/
* Dropping item-missing cases
// Strip the factor-variable prefix "i." so that the list holds plain variable names for rowmiss()/misschk
local strippedvars = subinstr("${controls2} ${empvar} ${allIRTbinary} ${allIRTordinal} ${allIRTnominal}", "i.", "", .) 	// for most missing vars
	/* I checked countries 10 (ES), 22 (NL), 2 (BE) and 17 (IT)
		misschk `strippedvars' if countrynum==17		// IT
	*/
capture drop misscount					// left behind if an earlier run aborted before the clean-up below
egen misscount = rowmiss(`strippedvars')
	* Checks
	capture log close missingness			// a stale named log from an aborted run would make -log using- fail
	log using "${dodir}\Outputs\2_disweights_missingness.txt", text replace name(missingness) 
		dis in red "ITEM MISSINGINESS (excluding ${disvar})"
		misschk `strippedvars'
		/* Further checks
		recode misscount (2/max=2)
		tab country misscount, row nof
		*/
	log close missingness
	* Actually doing the drops & cleaning up
	drop if misscount~=0					
	drop misscount
* Further bits of missingness to do 
// NOTE(review): ${if} is presumably a complete "if <condition>" qualifier, since it is passed straight to -keep- - confirm where it is defined
if ("${if}"~="")		keep ${if}			// So that bootstrap is not resampling people out of sample by definition




*_____________________________________________________________________________________________________________________________________________________
*
**# 1. PREDICTED DISABILITY WEIGHTS
* No variation around the coefficients, as want to get sampling uncertainty not coefficient uncertainty
*_____________________________________________________________________________________________________________________________________________________

	
// Assemble the working list of control variables (continuous controls plus expanded factor dummies)
// Factor controls are expanded into dummy variables with prefix I (when created, this was country##education)
if "${disweight_factors}" != "" {
	capture drop I*
	// -xi- writes interactions with "*", whereas the global uses "##"
	local xifac_controls = subinstr("${disweight_factors}", "##", "*", .)
	xi `xifac_controls', prefix(I)
	unab newvars: I*
}
local disweight_controls "${disweight_controls} `newvars'"
dis "`disweight_controls'"

/*  REVISED CONTROL LIST, HELD IN disweight_controls2:
	Every control gets two companions:
		<var>_pred - a copy of the control; this copy is what enters the regression
		<var>_mean - a constant holding the survey-weighted mean of the control
	After estimation the _pred copies are overwritten with the _mean values, so PREDICT evaluates all controls at their means.
	NOTE: setting binary variables to their mean is slightly odd, but works well for ensuring that predicted disability and
		observed disability have the same prevalence among people with observations on both (which is the aim).
	This is also done for COUNTRY (if it is among the controls) - to get at the actual association of impairments with
		disability, suitable controls are needed. */
global disweight_controls2 ""
foreach ctl in `disweight_controls' {
	// start clean in case copies survive from a previous run
	capture drop `ctl'_pred
	capture drop `ctl'_mean

	// the copy that enters the model
	gen `ctl'_pred = `ctl'
	label var `ctl'_pred	"`ctl' copy for prediction (actual value for model, mean for prediction)"

	// survey-weighted mean of the control, stored as a constant column
	quietly svy: mean `ctl'_pred
	matrix working = e(b)
	gen `ctl'_mean = working[1,1]
	label var `ctl'_mean "`ctl' mean value in full sample"
	sum `ctl'_mean			// a check

	// accumulate the names used by the regression model below
	global disweight_controls2 "${disweight_controls2} `ctl'_pred"
}
	

// Regression-based weights: fit the logit, predict with controls held at their means, rescale to observed prevalence
// "comparator" uses ${IRTindicators} and is kept only for the output table (to show the negative weights etc below)
eststo comparator: svy: logit ${disvar} ${IRTindicators} ${disweight_controls2}
// Main specification; fitted last, so it is the active estimation result when -predict- runs
eststo ${disvar}: svy: logit ${disvar} ${indicators} ${disweight_controls2}

	// Hold every control - including COUNTRY! - at its mean so that controls do not drive the prediction
	foreach ctl in `disweight_controls' {
		replace `ctl'_pred = `ctl'_mean
	}

	// The prediction: the probability of disability
	predict p_predicted
	label var p_predicted "PREDICTED VALUES of ${disvar} from regression weights"

	// Adjustment so that the mean of p_predicted equals the mean of ${disvar}
	// (prevalence is very close, if not quite identical, before rescaling)
	svy: mean ${disvar} p_predicted if !missing(${disvar}, p_predicted)
	local adjustment = r(table)[1,1] / r(table)[1,2]		// ratio of the two estimated means
	replace p_predicted = p_predicted*`adjustment'

	// Remove the temporary copies and the xi dummies
	// NOTE(review): these wildcards also drop ANY other variable ending in _pred or _mean, or starting with I
	drop *_pred *_mean
	capture drop I*
	

// Export both logit specifications to a CSV file, main specification first
// (star thresholds formerly noted here: + 0.10 * 0.05 ** 0.01; stars are switched off by nostar)
esttab ${disvar} comparator ///
	using "${dodir}\Outputs\2_dis_weights_predicted_${disvar}.csv", ///
	csv replace ///
	not p nopar nostar nonum nodepvars b(%5.4f) ///
	aic(%6.0fc) bic(%6.0fc) ///
	addnotes("First column is the specification used in the bootstrap" "Second column is to show negative weights if dont change spec, using all vars in IRT spec")


*_____________________________________________________________________________________________________________________________________________________
*
**# 2. WASHINGTON GROUP-STYLE MEASURE NOT USED HERE
/*_____________________________________________________________________________________________________________________________________________________
*/

*_____________________________________________________________________________________________________________________________________________________
*
**# 3. IRT MEASURE
* (Weighted results simply wouldn't work, whether I used SVY: IRT or simply [pw=${pweight}])
*_____________________________________________________________________________________________________________________________________________________

// Hybrid IRT model, which includes ordinal versions
// The model specification is assembled from whichever item lists are non-empty:
//   ${IRTordinal} -> (grm ...), ${IRTbinary} -> (2pl ...), ${IRTnominal} -> (nrm ...)
// An empty list is skipped rather than emitted as an empty "(grm )" term, which irt hybrid would reject.
local hybridsyntax1 ""
if "${IRTordinal}"!=""		local hybridsyntax1 "(grm ${IRTordinal})"
if "${IRTbinary}"!=""		local hybridsyntax1 "`hybridsyntax1' (2pl ${IRTbinary})"
if "${IRTnominal}"!=""		local hybridsyntax1 "`hybridsyntax1' (nrm ${IRTnominal})"
if `"`hybridsyntax1'"'=="" {
	dis as error "No IRT items supplied: IRTordinal, IRTbinary and IRTnominal are all empty"
	exit 198
}
dis `"`hybridsyntax1'"'
eststo IRT:	irt hybrid 	`hybridsyntax1'			// , vce(cluster mergeid)			
	// -pause- only halts when "pause on" has been set; otherwise this line does nothing
	if `e(converged)'==0 	pause Model not converged
	estat report, byparm sort(b)
	// Keep columns 1, 5 and 6 of the transposed r(table)
	// (in Stata's standard r(table) layout these are the estimate and the two confidence bounds)
	matrix IRTout = r(table)'
	matrix IRTout = IRTout[1...,1], IRTout[1...,5], IRTout[1...,6]
predict p_IRT, latent ebmeans 	// Empirical Bayes posterior estimate of latent variable
	label var p_IRT "DISABILITY SCORE using IRT measure (hybrid / 2-parameter)"

	
// Hybrid IRT model, with longer list of variables
// The model specification is assembled from whichever item lists are non-empty:
//   ${allIRTordinal} -> (grm ...), ${allIRTbinary} -> (2pl ...), ${allIRTnominal} -> (nrm ...)
// An empty list is skipped rather than emitted as an empty "(grm )" term, which irt hybrid would reject.
local hybridsyntax2 ""
if "${allIRTordinal}"!=""		local hybridsyntax2 "(grm ${allIRTordinal})"
if "${allIRTbinary}"!=""		local hybridsyntax2 "`hybridsyntax2' (2pl ${allIRTbinary})"
if "${allIRTnominal}"!=""		local hybridsyntax2 "`hybridsyntax2' (nrm ${allIRTnominal})"
if `"`hybridsyntax2'"'=="" {
	dis as error "No IRT items supplied: allIRTordinal, allIRTbinary and allIRTnominal are all empty"
	exit 198
}
dis `"eststo allIRT:	irt hybrid 	`hybridsyntax2'			// , vce(cluster mergeid)	"'

eststo allIRT:	irt hybrid 	`hybridsyntax2'			// , vce(cluster mergeid)			
	// -pause- only halts when "pause on" has been set; otherwise this line does nothing
	if `e(converged)'==0 	pause Model not converged
	estat report, byparm sort(b)
	// Keep columns 1, 5 and 6 of the transposed r(table)
	// (in Stata's standard r(table) layout these are the estimate and the two confidence bounds)
	matrix allIRTout = r(table)'
	matrix allIRTout = allIRTout[1...,1], allIRTout[1...,5], allIRTout[1...,6]
predict p_allIRT, latent ebmeans		// Empirical Bayes posterior estimate of latent variable
	label var p_allIRT "DISABILITY SCORE using IRT measure with more indicators (hybrid / 2-parameter)"
	
	
// Outputs: one CSV of stored IRT estimates, plus one CSV per parameter matrix
// (star thresholds formerly noted here: + 0.10 * 0.05 ** 0.01; stars are switched off by nostar)
// NOTE(review): the pattern IRT* matches stored estimates whose names START with IRT, so "allIRT" is
//               not included in this table - confirm whether that is intended
esttab IRT* ///
	using "${dodir}\Outputs\2_dis_weights_IRTregs.csv", ///
	csv replace not p nopar nostar nonum nodepvars b(%5.4f)

// The matrix exports run under -capture-, so a failure here is silent and does not stop the do-file
capture esttab matrix(IRTout) ///
	using "${dodir}\Outputs\2_dis_weights_IRT.csv", ///
	csv replace not p nopar nostar nonum nodepvars b(%5.4f)
capture esttab matrix(allIRTout) ///
	using "${dodir}\Outputs\2_dis_weights_allIRT.csv", ///
	csv replace not p nopar nostar nonum nodepvars b(%5.4f)


		
*_____________________________________________________________________________________________________________________________________________________
*
**# TURNING PROBABILITIES/SCALES INTO DISABILITY VARS
*_____________________________________________________________________________________________________________________________________________________

// PREDICTED: first construction of the predicted disability variable
// NOTE: the bootstrap redoes this on every replicate; it is built here only so that the graphs (below) can be
//       drawn and the bootstrap syntax can be tested in advance

// One uniform random draw per observation, with a fixed seed so that the draw is reproducible
set seed 1982
capture drop rand1
gen rand1 = runiform()
	label var rand1 "Random values, set in 4_dis_weights.do (but re-done within the bootstrap)"

// Disabled (1) when the predicted probability exceeds the random draw, otherwise 0;
// left missing wherever p_predicted is missing
gen ${predictedvar} = (p_predicted > rand1) if !missing(p_predicted)
	label var ${predictedvar} "Disability - regression-based measure using ${disvar}"
	// tab ${disvar} predicted_${disvar}, cell nof


// 	FIXED VARS: Setting % disability to be the same as observed disability	
// 		The cutpoints below are found in two steps (_pctile, then a collapse to find the neighbouring score value),
// 		which matches weighted prevalence more precisely than _pctile alone
// 		NOTE(review): earlier versions of this header mentioned mat2txt and a slow svy: tab; neither appears in the code below

* Firstly, get the proportion of people who report a disability
	// Survey-weighted proportions of each category of ${disvar}
	svy: prop ${disvar}
	matrix output = e(b)
	// prop_dis = 100 minus 100 times the SECOND proportion in e(b); it is used below as the percentile at which to cut the scores.
	// Presumably ${disvar} is coded 0/1, so that the second column is the share reporting a disability - TODO confirm
	global prop_dis			= 100 - (100*output[1,2])

* Latent disability
// For each IRT score (p_IRT, p_allIRT) this loop creates two 0/1 variables, suffixed L and H, from two
// adjacent cutpoints on the score:
//   L - cutpoint is the weighted percentile prop_dis of the score, from _pctile
//   H - cutpoint is the distinct score value immediately before that one in ascending order
// The scalars cutoff and prevvalue are overwritten on each pass, so afterwards they hold the values for allIRT.
// NOTE(review): a missing score compares as larger than any cutpoint and would be coded 1; this relies on the
//               scores being non-missing for everyone left in the sample - confirm
foreach v in IRT allIRT	{
	capture drop ${disvar}_`v'?			// just in case running this twice, we need to drop the variables we're about to create
	// Ensure that varying precision in globals doesn't cause problems 
		replace p_`v' = round(p_`v', 0.000001)
		recast double p_`v'					// to avoid rounding problems - see help precision
	// Find the cutpoint on the latent-variable score that produces the same level of disability
		* Firstly do _pctile, to get the lower bounded version 
		_pctile p_`v' if !missing(${disvar}) [pw=${pweight}], percentiles(${prop_dis})
			scalar cutoff = `r(r1)'
		gen byte ${disvar}_`v'L = (p_`v' > cutoff + 0.0000001)		// see help precision for why the + 0.0000001 is added
			label var ${disvar}_`v'L				"Disability - latent measure (just below threshold)"
		* Then find the upper bounded version (from the next-lowest value of p_`v' going in reverse order)
		// Inside preserve/restore the data are collapsed to one row per distinct score value;
		// collapse with by() leaves the rows sorted by the score, so [_n-1] is the next-lowest distinct value
		preserve
			collapse (percent) prev=${disvar} (count) count=${disvar} [pw=${pweight}] if !missing(${disvar}), by(p_`v')
			gen prevvalue = p_`v'[_n-1] if abs(p_`v'-cutoff)<0.0000001		// see help precision to understand why "if p_`v'==cutoff" doesn't quite work here
			sum prevvalue 
			// If no row matches the cutoff (or it is the first row), r(mean) is empty and the next line stops with an error
			scalar prevvalue = `r(mean)'
		restore
		// The VARIABLE prevvalue existed only in the collapsed data; after restore the name refers to the SCALAR
		// (assuming no variable in the restored data is named, or abbreviates to, prevvalue)
		gen byte ${disvar}_`v'H = (p_`v' > (prevvalue + 0.0000001) )		// see help precision for why the + 0.0000001 is added
			label var ${disvar}_`v'H				"Disability - latent measure (just above threshold)"
	dis "Cutoffs for `v' are" _newline(1) "L=" cutoff _newline(1) "H=" prevvalue
/**/						}

* Predicted disability, fixed version
// Same two-cutpoint construction as for the latent scores, applied to p_predicted (no random draw involved):
//   ..._predicted_fxdL - cutpoint is the weighted percentile prop_dis of p_predicted, from _pctile
//   ..._predicted_fxdH - cutpoint is the distinct p_predicted value immediately before that one in ascending order
// Leaves the scalars pred_cutoff and pred_prevvalue defined.
	capture drop ${disvar}_predicted_fxd?			// just in case running this twice, we need to drop the variables we're about to create
	// Ensure that varying precision in globals doesn't cause problems 
		replace p_predicted = round(p_predicted, 0.000001)
		recast double p_predicted 					// to avoid rounding problems - see help precision
	// Find the cutpoint on the predicted probability that produces the same level of disability
		* Firstly do _pctile, to get the lower bounded version 
		_pctile p_predicted if !missing(${disvar}) [pw=${pweight}], percentiles(${prop_dis})
			scalar pred_cutoff = `r(r1)'
		gen byte ${disvar}_predicted_fxdL = (p_predicted > pred_cutoff + 0.0000001)	// see help precision for why the + 0.0000001 is added
			label var ${disvar}_predicted_fxdL				"Disability - regression-based measure WITHOUT random element (just below threshold)"
		* Then find the upper bounded version (from the next-lowest value of p_predicted going in reverse order)
		// Inside preserve/restore the data are collapsed to one row per distinct value of p_predicted;
		// collapse with by() leaves the rows sorted, so [_n-1] is the next-lowest distinct value
		preserve
			collapse (percent) prev=${disvar} (count) count=${disvar} [pw=${pweight}] if !missing(${disvar}), by(p_predicted)
			gen prevvalue = p_predicted[_n-1] if abs(p_predicted-pred_cutoff)<0.0000001		// see help precision to understand why "if p_predicted==pred_cutoff" doesn't quite work here
			sum prevvalue 
				// If no row matches the cutoff (or it is the first row), r(mean) is empty and the next line stops with an error
				scalar pred_prevvalue = `r(mean)'
		restore
		gen byte ${disvar}_predicted_fxdH = (p_predicted > (pred_prevvalue + 0.0000001) )	// the 0.0000001 is because of precision errors, even with the syntax above...
			label var ${disvar}_predicted_fxdH				"Disability - regression-based measure WITHOUT random element (just above threshold)"
dis "Cutoffs are" _newline(1) "L=" pred_cutoff _newline(1) "H=" pred_prevvalue

	
// DECIDING WHETHER JUST ABOVE OR JUST BELOW THE CUT-OFF IS BEST
// For each fixed-prevalence measure, keep ONE of the two candidate versions (suffix L or H) under the
// unsuffixed name and drop the other. The choice is read from ${whichversions}:
//   word 2 -> suffix for the IRT measure
//   word 4 -> suffix for the allIRT measure
//   word 6 -> suffix for the fixed predicted measure
// NOTE(review): assumes ${IRTvar}, ${allIRTvar} and ${predictedvar}_fxd resolve to the variable stems created
//               above (${disvar}_IRT, ${disvar}_allIRT, ${disvar}_predicted_fxd) - confirm where the globals are set
svy: mean ${disvar}* ${WGvar} if !missing(${disvar})
* Decision as of 11th July 2023, for llsiH
local whichIRT 		= word("${whichversions}", 2)
local whichallIRT 	= word("${whichversions}", 4)
local whichpredfxd 	= word("${whichversions}", 6)
rename ${IRTvar}`whichIRT' 					${IRTvar}
// Uses the allIRT choice (word 4); this previously reused the IRT choice (word 2), leaving whichallIRT unused
rename ${allIRTvar}`whichallIRT' 			${allIRTvar}
rename ${predictedvar}_fxd`whichpredfxd'	${predictedvar}_fxd 
* Quick check
svy: mean ${disvar}* ${WGvar} if !missing(${disvar})
pause Requires checking by hand that these are the right choices
// Drop the versions that were not chosen (the "?" wildcard matches exactly one trailing character)
drop ${IRTvar}? ${allIRTvar}? ${predictedvar}_fxd? 


// For alternative versions (e.g. to match World Disability Report) or checks (e.g. whether regime dis prevalence varies by cutpoint), see 9_checks_and_alternatives.do

		
*_____________________________________________________________________________________________________________________________________________________
*
**# CLEANING UP
*_____________________________________________________________________________________________________________________________________________________

// Remove the random draw used for the predicted disability variable (it is re-done within the bootstrap)
drop rand1
// Cosmetic reordering of variables; run under -capture- so it is skipped silently if either variable is absent
// NOTE(review): the variable names are hard-coded here rather than taken from the globals used elsewhere in this file
capture order rllsi_WG, before(llsiH_pred)

